Gender prediction on USA baby names


In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn import cross_validation
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, LabelBinarizer

from sklearn.linear_model import LogisticRegression
from sklearn.cluster.hierarchical import AgglomerativeClustering

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("ticks")
sns.set_context("poster")

In [2]:
df = pd.read_csv("data/SSN_names_state.txt", header=None)

In [3]:
df.head()


Out[3]:
0 1 2 3 4
0 AK F 1910 Mary 14
1 AK F 1910 Annie 12
2 AK F 1910 Anna 10
3 AK F 1910 Margaret 8
4 AK F 1910 Helen 7

In [4]:
df.columns = ["State", "Gender", "Year", "Name", "Count"]

In [5]:
df.head()


Out[5]:
State Gender Year Name Count
0 AK F 1910 Mary 14
1 AK F 1910 Annie 12
2 AK F 1910 Anna 10
3 AK F 1910 Margaret 8
4 AK F 1910 Helen 7

In [6]:
df.shape


Out[6]:
(5647426, 5)

In [7]:
df["Gender"].value_counts() # Instances per gender


Out[7]:
F    3154009
M    2493417
dtype: int64

In [8]:
# Calculate number of names of each gender per year
df_t = df.pivot_table(values="Count", index="Year", columns="Gender", aggfunc=np.sum)
df_t.sum(axis=0)


Out[8]:
Gender
F    143770075
M    155113251
dtype: int64

In [9]:
df_a = df_t.div(df_t.sum(axis=1), axis=0)*100
plt.plot(df_a.index, df_a["F"], label="Female", marker="o")
plt.plot(df_a.index, df_a["M"], label="Male", marker="s")
plt.xlabel("Year")
plt.ylabel("Percentage of names")
plt.legend(frameon=True, fancybox=True)


Out[9]:
<matplotlib.legend.Legend at 0x7f93bc1235d0>

In [10]:
# Get top gender names per state
df.sort(["Count"], ascending=False).groupby(["State", "Gender"]).head(1)\
.pivot_table(values=["Name", "Count", "Year"], columns="Gender", index="State", aggfunc=lambda x: x.iloc[0])\
.swaplevel(0, 1, axis=1).sort_index(axis=1)


Out[10]:
Gender F M
Count Name Year Count Name Year
State
AK 164 Jessica 1985 207 Michael 1982
AL 2694 Mary 1925 3038 James 1947
AR 1424 Linda 1948 1620 James 1947
AZ 875 Jessica 1987 1045 Michael 1984
CA 6949 Jessica 1991 8268 Michael 1956
CO 1036 Linda 1948 973 Michael 1970
CT 1155 Linda 1947 1622 Robert 1947
DC 615 Linda 1949 884 John 1947
DE 207 Jennifer 1974 311 Michael 1958
FL 2739 Jessica 1987 3491 Michael 1990
GA 2614 Mary 1922 3066 James 1947
HI 243 Jennifer 1981 325 Michael 1957
IA 2365 Linda 1947 1667 Robert 1924
ID 530 Linda 1947 348 Michael 1954
IL 5285 Linda 1947 6220 Michael 1957
IN 3012 Linda 1947 2781 Michael 1954
KS 1387 Linda 1947 1184 Michael 1954
KY 2537 Mary 1924 2553 James 1947
LA 1648 Linda 1947 1667 Michael 1958
MA 3035 Mary 1921 3895 Robert 1947
MD 1505 Linda 1947 1789 Michael 1957
ME 778 Linda 1947 649 Robert 1947
MI 4538 Linda 1947 4924 Michael 1957
MN 2144 Linda 1948 2043 David 1955
MO 2797 Linda 1947 2430 James 1947
MS 1840 Mary 1927 2278 James 1947
MT 472 Linda 1948 403 John 1947
NC 3126 Mary 1924 3884 James 1947
ND 496 Linda 1947 344 David 1952
NE 1055 Linda 1947 937 Robert 1924
NH 437 Linda 1947 447 Robert 1947
NJ 2398 Jennifer 1972 3699 John 1964
NM 589 Mary 1947 543 David 1961
NV 268 Jessica 1988 341 Michael 1990
NY 7542 Linda 1947 10023 Robert 1947
OH 5881 Linda 1947 5492 David 1955
OK 2064 Linda 1947 1273 James 1925
OR 1121 Linda 1947 984 Michael 1953
PA 8184 Mary 1918 8048 John 1947
RI 485 Mary 1921 682 Robert 1947
SC 1742 Mary 1924 2314 James 1947
SD 507 Linda 1948 413 Robert 1925
TN 2785 Linda 1947 3279 James 1947
TX 5060 Linda 1947 4983 Christopher 1985
UT 702 Jennifer 1972 657 Michael 1980
VA 2307 Linda 1948 2676 James 1947
VT 325 Linda 1947 262 David 1955
WA 1751 Linda 1948 1620 Michael 1954
WI 2399 Mary 1954 2400 David 1955
WV 1795 Linda 1947 1682 James 1947
WY 229 Linda 1948 194 David 1955

In [11]:
# Get top gender names per state
# Same as above but more efficient syntax
df.sort(["Count"], ascending=False).groupby(["State", "Gender"]).head(1)\
.pivot(index="State", columns="Gender").swaplevel(0, 1, axis=1).sort_index(axis=1)


Out[11]:
Gender F M
Count Name Year Count Name Year
State
AK 164 Jessica 1985 207 Michael 1982
AL 2694 Mary 1925 3038 James 1947
AR 1424 Linda 1948 1620 James 1947
AZ 875 Jessica 1987 1045 Michael 1984
CA 6949 Jessica 1991 8268 Michael 1956
CO 1036 Linda 1948 973 Michael 1970
CT 1155 Linda 1947 1622 Robert 1947
DC 615 Linda 1949 884 John 1947
DE 207 Jennifer 1974 311 Michael 1958
FL 2739 Jessica 1987 3491 Michael 1990
GA 2614 Mary 1922 3066 James 1947
HI 243 Jennifer 1981 325 Michael 1957
IA 2365 Linda 1947 1667 Robert 1924
ID 530 Linda 1947 348 Michael 1954
IL 5285 Linda 1947 6220 Michael 1957
IN 3012 Linda 1947 2781 Michael 1954
KS 1387 Linda 1947 1184 Michael 1954
KY 2537 Mary 1924 2553 James 1947
LA 1648 Linda 1947 1667 Michael 1958
MA 3035 Mary 1921 3895 Robert 1947
MD 1505 Linda 1947 1789 Michael 1957
ME 778 Linda 1947 649 Robert 1947
MI 4538 Linda 1947 4924 Michael 1957
MN 2144 Linda 1948 2043 David 1955
MO 2797 Linda 1947 2430 James 1947
MS 1840 Mary 1927 2278 James 1947
MT 472 Linda 1948 403 John 1947
NC 3126 Mary 1924 3884 James 1947
ND 496 Linda 1947 344 David 1952
NE 1055 Linda 1947 937 Robert 1924
NH 437 Linda 1947 447 Robert 1947
NJ 2398 Jennifer 1972 3699 John 1964
NM 589 Mary 1947 543 David 1961
NV 268 Jessica 1988 341 Michael 1990
NY 7542 Linda 1947 10023 Robert 1947
OH 5881 Linda 1947 5492 David 1955
OK 2064 Linda 1947 1273 James 1925
OR 1121 Linda 1947 984 Michael 1953
PA 8184 Mary 1918 8048 John 1947
RI 485 Mary 1921 682 Robert 1947
SC 1742 Mary 1924 2314 James 1947
SD 507 Linda 1948 413 Robert 1925
TN 2785 Linda 1947 3279 James 1947
TX 5060 Linda 1947 4983 Christopher 1985
UT 702 Jennifer 1972 657 Michael 1980
VA 2307 Linda 1948 2676 James 1947
VT 325 Linda 1947 262 David 1955
WA 1751 Linda 1948 1620 Michael 1954
WI 2399 Mary 1954 2400 David 1955
WV 1795 Linda 1947 1682 James 1947
WY 229 Linda 1948 194 David 1955

In [12]:
# Do for full data:
df_props = df.pivot_table(index=["Name", "State", "Year"], columns="Gender")["Count"].reset_index().fillna(0)

In [13]:
df_props.head()


Out[13]:
Gender Name State Year F M
0 Aaban NY 2013 0 6
1 Aaban NY 2014 0 6
2 Aadan CA 2008 0 7
3 Aadan CA 2009 0 6
4 Aadan CA 2014 0 5

In [14]:
df_props.columns


Out[14]:
Index([u'Name', u'State', u'Year', u'F', u'M'], dtype='object', name=u'Gender')

In [15]:
df_props.shape, df.shape


Out[15]:
((5448518, 5), (5647426, 5))

In [16]:
# Get top 20 names which have occured in the maximum number of years
df.groupby("Name")["Year"].apply(lambda x: len(x.unique())).sort(ascending=False, inplace=False).head(20)


Out[16]:
Name
Antoinette    105
Nellie        105
Dolores       105
Ivory         105
Ivy           105
Dixie         105
Nina          105
Amos          105
Jack          105
Jackie        105
Jackson       105
Nick          105
Jacob         105
Nicholas      105
Jacqueline    105
Nelson        105
Neil          105
Grant         105
Neal          105
Nathaniel     105
Name: Year, dtype: int64

In [17]:
df_props["Total"] = df_props[["F", "M"]].sum(axis=1)
df_props["MaxCount"] = df_props[["F", "M"]].max(axis=1)
df_props.head()


Out[17]:
Gender Name State Year F M Total MaxCount
0 Aaban NY 2013 0 6 6 6
1 Aaban NY 2014 0 6 6 6
2 Aadan CA 2008 0 7 7 7
3 Aadan CA 2009 0 6 6 6
4 Aadan CA 2014 0 5 5 5

In [31]:
# Write names per gender to file
df_props.to_csv("data/ssn_namesbystate_counts.txt", sep="\t", index=False)

In [18]:
(df_props[["F", "M"]].div(df_props["Total"], axis=0).max(axis=1) > 0.9).mean()


Out[18]:
0.97653949202333556

In [19]:
df_props[["F", "M"]].div(df_props["Total"], axis=0).idxmax(axis=1).head()


Out[19]:
0    M
1    M
2    M
3    M
4    M
dtype: object

In [20]:
df_props["BestGender"] = df_props[["F", "M"]].div(df_props["Total"], axis=0).idxmax(axis=1)

In [21]:
df_props.head()


Out[21]:
Gender Name State Year F M Total MaxCount BestGender
0 Aaban NY 2013 0 6 6 6 M
1 Aaban NY 2014 0 6 6 6 M
2 Aadan CA 2008 0 7 7 7 M
3 Aadan CA 2009 0 6 6 6 M
4 Aadan CA 2014 0 5 5 5 M

In [22]:
df["Name"].unique().shape


Out[22]:
(30274,)

In [23]:
df_props[df_props["Total"] > 100].shape


Out[23]:
(575161, 8)

In [24]:
df[(df["Name"] == "Aaban") & (df["State"] == "NY") & (df["Year"] == 2013)]


Out[24]:
State Gender Year Name Count
3865866 NY M 2013 Aaban 6

In [25]:
df_props.describe()


Out[25]:
Year F M Total MaxCount
count 5448518.000000 5448518.000000 5448518.000000 5448518.000000 5448518.000000
mean 1972.316033 26.387006 28.468888 54.855894 54.256168
std 29.617741 109.590313 152.552096 185.000905 183.863425
min 1910.000000 0.000000 0.000000 5.000000 5.000000
25% 1949.000000 0.000000 0.000000 7.000000 7.000000
50% 1977.000000 6.000000 0.000000 13.000000 13.000000
75% 1999.000000 15.000000 12.000000 35.000000 35.000000
max 2014.000000 8184.000000 10023.000000 10046.000000 10023.000000

In [26]:
df_props[(df_props["Total"] > 500) & ((df_props["Total"]/df_props["MaxCount"]) == 1)].shape


Out[26]:
(59777, 8)

In [27]:
# Filter names to only include names which occur in 500 times per year and are 100% of a given gender
df_props_filtered = df_props[(df_props["Total"] > 500) & ((df_props["Total"]/df_props["MaxCount"]) == 1)].copy()
df_props_filtered.to_csv("data/ssn_namesbystate_counts.filtered.txt", sep="\t", index=False)

In [28]:
# Create one dataset for getting only names and genders

df_names = df_props.groupby("Name")[["F", "M"]].sum().reset_index()
df_names.head()


Out[28]:
Gender Name F M
0 Aaban 0 12
1 Aadan 0 23
2 Aadarsh 0 5
3 Aaden 0 3426
4 Aadhav 0 6

In [29]:
df_names["Total"] = df_names[["F", "M"]].sum(axis=1)
df_names["MaxCount"] = df_names[["F", "M"]].max(axis=1)
df_names["BestGender"] = df_names[["F", "M"]].div(df_names["Total"], axis=0).idxmax(axis=1)
df_names.head()


Out[29]:
Gender Name F M Total MaxCount BestGender
0 Aaban 0 12 12 12 M
1 Aadan 0 23 23 23 M
2 Aadarsh 0 5 5 5 M
3 Aaden 0 3426 3426 3426 M
4 Aadhav 0 6 6 6 M

In [93]:
# First add start and end characters to identify beginning and end of chars
df_props_filtered["Name_proc"] = df_props_filtered.Name.apply(lambda x: ("^%s$" % x.strip()))
df_props_filtered[["Name", "Year","State","Name_proc", "F", "M", "MaxCount","Total","BestGender"]].to_csv("data/ssn_namesbystate_counts.filtered.txt", sep="\t", index=False)


df_names["Name_proc"] = df_names.Name.apply(lambda x: "^%s$" % x.strip())
#df_names.head()
df_names[["Name","Name_proc", "F", "M", "MaxCount","Total","BestGender"]].to_csv("data/ssn_names.txt", sep="\t", index=False)

# Get latitude and longitudes for each state
df_states_latlong = pd.read_csv("data/state_latlon.csv")

df_states_latlong.columns = ["State", "Latitude", "Longitude"]

df_names_latlong = pd.merge(df_props_filtered, df_states_latlong, how="left", on="State")
#pd.merge(df, df_states_latlong, how="left", on="State").to_csv("data/SSN_names_state.latlong.txt", index=False, sep="\t")

In [95]:
df_props_year = df_props.groupby(["Name", "Year"]).sum().reset_index()

In [96]:
print df_props_year.shape
df_props_year.head()


(548154, 6)
Out[96]:
Gender Name Year F M Total MaxCount
0 Aaban 2013 0 6 6 6
1 Aaban 2014 0 6 6 6
2 Aadan 2008 0 12 12 12
3 Aadan 2009 0 6 6 6
4 Aadan 2014 0 5 5 5

In [99]:
df_props_year["BestGender"] = df_props_year[["F", "M"]].div(df_props_year["Total"], axis=0).idxmax(axis=1)
df_props_year.head()


Out[99]:
Gender Name Year F M Total MaxCount BestGender
0 Aaban 2013 0 6 6 6 M
1 Aaban 2014 0 6 6 6 M
2 Aadan 2008 0 12 12 12 M
3 Aadan 2009 0 6 6 6 M
4 Aadan 2014 0 5 5 5 M

In [104]:
gg = df_props_year[df_props_year.Total > 1000].groupby(["Year", "BestGender"])

In [109]:
#gg.filter(lambda x: x.Total > (np.random.rand(x.Total.shape[0])*x.Total.max())).head()

Training the model on the filtered data

We will first use a simple logisitic regression model char features


In [34]:
cvec = CountVectorizer(analyzer="char", ngram_range=(2, 4), binary=True, lowercase=False, min_df=10)
cvec.fit(df_names.Name_proc.values)


Out[34]:
CountVectorizer(analyzer='char', binary=True, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(2, 4), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [35]:
cvec.transform(df_names.Name_proc.values[:10])


Out[35]:
<10x6895 sparse matrix of type '<type 'numpy.int64'>'
	with 133 stored elements in Compressed Sparse Row format>

In [36]:
model = LogisticRegression(penalty="l1")

np.random.seed(1337)
shuffle_ids = np.random.permutation(df_names.shape[0])
split_point = int(len(shuffle_ids)*0.6)
train_ids, test_ids = shuffle_ids[:split_point], shuffle_ids[split_point:]
train_ids.shape, test_ids.shape

X_train, y_train = cvec.transform(df_names.iloc[train_ids].Name_proc.values), df_names.iloc[train_ids].BestGender
X_test, y_test = cvec.transform(df_names.iloc[test_ids].Name_proc.values), df_names.iloc[test_ids].BestGender

#X_train, X_test, y_train, y_test = cross_validation.train_test_split(cvec.transform(df_names.Name_proc.values),df_names.BestGender, test_size=0.4, random_state=100)

model.fit(X_train, y_train)


Out[36]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [37]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape


Out[37]:
((18164, 6895), (18164,), (12110, 6895), (12110,))

In [38]:
y_pred = model.predict(X_test)

In [39]:
print classification_report(y_pred, y_test)


             precision    recall  f1-score   support

          F       0.90      0.91      0.90      7330
          M       0.86      0.84      0.85      4780

avg / total       0.88      0.88      0.88     12110


In [42]:
pd.concat((df_names.iloc[test_ids][["Name", "BestGender"]], pd.Series(y_pred, name="Predicted", index=test_ids)), axis=1).head()


Out[42]:
Gender Name BestGender Predicted
24482 Santanna F F
19861 Merisa F F
23753 Roneisha F F
25650 Sheyla F F
3830 Boston M M

In [43]:
df_model_coeffs = pd.DataFrame({"Feature": cvec.get_feature_names(), "Coeff": model.coef_[0]})

In [44]:
df_model_coeffs["Feature_len"] = df_model_coeffs["Feature"].apply(lambda x: len(x))
df_model_coeffs.sort("Coeff", ascending=False).head(20)


Out[44]:
Coeff Feature Feature_len
1258 4.605880 Shlo 4
6103 3.106888 siah 4
1509 3.013812 ^Abd 4
5109 2.542565 ndel 4
5638 2.505959 rag 3
6844 2.485604 zai 3
6404 2.466336 uda 3
4613 2.463779 kw 2
938 2.416480 Mark 4
6592 2.324890 vil 3
2430 2.314680 ^Zac 4
5561 2.296611 ovan 4
4003 2.264858 hay 3
5543 2.264134 ota 3
6773 2.257093 yne$ 4
975 2.242189 Merl 4
6275 2.241032 tho 3
934 2.231238 Marc 4
5486 2.209771 ord 3
2797 2.203274 anni 4

In [45]:
df_names[df_names.Name_proc.str.contains("ndel")]


Out[45]:
Gender Name F M Total MaxCount BestGender Name_proc
1735 Andelyn 10 0 10 10 F ^Andelyn$
3761 Blondell 631 0 631 631 F ^Blondell$
3896 Brandel 6 0 6 6 F ^Brandel$
4003 Breindel 69 0 69 69 F ^Breindel$
4615 Candelaria 1328 0 1328 1328 F ^Candelaria$
4616 Candelario 0 1565 1565 1565 M ^Candelario$
10368 Glendel 0 20 20 20 M ^Glendel$
10369 Glendell 0 24 24 24 M ^Glendell$
11084 Hendel 29 0 29 29 F ^Hendel$
11243 Hindel 5 0 5 5 F ^Hindel$
12551 Jandel 0 51 51 51 M ^Jandel$
15409 Kendel 15 20 35 20 M ^Kendel$
15410 Kendell 235 1683 1918 1683 M ^Kendell$
15924 Kindell 5 0 5 5 F ^Kindell$
16403 Kyndel 40 0 40 40 F ^Kyndel$
16404 Kyndell 15 0 15 15 F ^Kyndell$
17597 Lindell 0 906 906 906 M ^Lindell$
17791 Londell 0 11 11 11 M ^Londell$
18150 Lyndel 0 5 5 5 M ^Lyndel$
18151 Lyndell 27 245 272 245 M ^Lyndell$
18718 Mandel 0 5 5 5 M ^Mandel$
18719 Mandela 0 10 10 10 M ^Mandela$
19823 Mendel 0 1383 1383 1383 M ^Mendel$
20156 Mindel 49 0 49 49 F ^Mindel$
22622 Quandell 0 5 5 5 M ^Quandell$
22684 Quindell 0 5 5 5 M ^Quindell$
22962 Randel 0 1139 1139 1139 M ^Randel$
22963 Randell 0 6228 6228 6228 M ^Randell$
23310 Rendell 0 11 11 11 M ^Rendell$
23746 Rondel 0 5 5 5 M ^Rondel$
23747 Rondell 0 649 649 649 M ^Rondell$
24915 Shaindel 483 0 483 483 F ^Shaindel$
25073 Shandel 11 0 11 11 F ^Shandel$
25074 Shandell 26 0 26 26 F ^Shandell$
25393 Shawndell 7 21 28 21 M ^Shawndell$
25472 Sheindel 31 0 31 31 F ^Sheindel$
25767 Shondell 13 18 31 18 M ^Shondell$
28818 Vondell 10 0 10 10 F ^Vondell$
28948 Wendel 0 143 143 143 M ^Wendel$
28949 Wendelin 5 21 26 21 M ^Wendelin$
28950 Wendell 0 48363 48363 48363 M ^Wendell$
28951 Wendelyn 5 0 5 5 F ^Wendelyn$
29078 Windell 0 1076 1076 1076 M ^Windell$
29149 Wyndell 0 15 15 15 M ^Wyndell$
29320 Yandel 0 2509 2509 2509 M ^Yandel$
29321 Yandell 0 5 5 5 M ^Yandell$

In [ ]:
df_sample = df_props_filtered[["Name", "Year","State","Name_proc", "F", "M", "MaxCount","Total","BestGender"]]

Train model on name character ngram + year feature

Add year feature


In [47]:
cvec = CountVectorizer(analyzer="char", ngram_range=(2, 4), binary=True, lowercase=False, min_df=10)
cvec.fit(df_props_filtered.Name_proc)
cvec.transform(df_props_filtered.Name_proc.head()).todense().shape


Out[47]:
(5, 3098)

In [48]:
class ColumnFeatures(TransformerMixin):
    def __init__(self, colname, to_df=True):
        print "Initialized extractor for column %s" % colname
        self.colname = colname
        self.to_df = to_df
    def get_feature_names(self):
        return [self.colname]
    def transform(self, X, **transform_params):
        print "Extracting column [%s], to_df = %s" % (self.colname, self.to_df)
        if self.to_df:
            return pd.DataFrame(X[self.colname])
        return X[self.colname]
    def fit(self, X, y=None, **fit_params):
        return self


class IdentityTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        print "X processed by parent. Output shape: %s" % (X.shape, )
        return X
    def fit(self, X, y=None, **fit_params):
        return self
    
class DenseTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        print "New shape: ",  X.todense().shape
        return X.todense()
    def fit(self, X, y=None, **fit_params):
        return self
    

class MultiColumnExtractor(TransformerMixin):
    def __init__(self, colnames):
        print "Initialized extractor for column %s" % colnames
        self.colnames = colnames
    def get_feature_names(self):
        return self.colnames
    def transform(self, X, **transform_params):
        print "Extracting columns [%s]" % (self.colnames,)
        return pd.DataFrame(X[self.colnames])
    def fit(self, X, y=None, **fit_params):
        return self
        
pipeline = Pipeline([
        ("features", FeatureUnion([
                    ("year",  Pipeline([
                                ("year_val", ColumnFeatures(colname="Year")),
                                ("year_norm", StandardScaler())
                    ])),
                    ("state", Pipeline([
                               ("state_val", ColumnFeatures(colname="State")),
                               ("state_cat", LabelBinarizer())
                    ])),
                    ("names", Pipeline([
                               ("name_val", ColumnFeatures(colname="Name_proc", to_df=False)),
                               ("name_ngram", CountVectorizer(analyzer="char", ngram_range=(2, 4), binary=True, lowercase=False, min_df=10)),
                    ])),
                    
                ]))
    ])


Initialized extractor for column Year
Initialized extractor for column State
Initialized extractor for column Name_proc

In [49]:
pipeline.fit_transform(df_props_filtered).shape


Extracting column [Year], to_df = True
Extracting column [State], to_df = True
Extracting column [Name_proc], to_df = False
/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py:498: UserWarning: StandardScaler assumes floating point values as input, got int64
  "got %s" % (estimator, X.dtype))
Out[49]:
(59777, 3141)

In [50]:
df_props_filtered.values[:10]


Out[50]:
array([['Aaliyah', 'CA', 2009, 556.0, 0.0, 556.0, 556.0, 'F', '^Aaliyah$'],
       ['Aaliyah', 'CA', 2010, 563.0, 0.0, 563.0, 563.0, 'F', '^Aaliyah$'],
       ['Aaliyah', 'CA', 2011, 678.0, 0.0, 678.0, 678.0, 'F', '^Aaliyah$'],
       ['Aaliyah', 'CA', 2012, 737.0, 0.0, 737.0, 737.0, 'F', '^Aaliyah$'],
       ['Aaliyah', 'CA', 2013, 706.0, 0.0, 706.0, 706.0, 'F', '^Aaliyah$'],
       ['Aaliyah', 'CA', 2014, 695.0, 0.0, 695.0, 695.0, 'F', '^Aaliyah$'],
       ['Aaliyah', 'TX', 2010, 555.0, 0.0, 555.0, 555.0, 'F', '^Aaliyah$'],
       ['Aaliyah', 'TX', 2011, 549.0, 0.0, 549.0, 549.0, 'F', '^Aaliyah$'],
       ['Aaliyah', 'TX', 2012, 682.0, 0.0, 682.0, 682.0, 'F', '^Aaliyah$'],
       ['Aaliyah', 'TX', 2013, 603.0, 0.0, 603.0, 603.0, 'F', '^Aaliyah$']], dtype=object)

In [51]:
model = LogisticRegression(penalty="l1")

np.random.seed(1337)
shuffle_ids = np.random.permutation(df_names.shape[0])
split_point = int(len(shuffle_ids)*0.6)
train_ids, test_ids = shuffle_ids[:split_point], shuffle_ids[split_point:]
train_ids.shape, test_ids.shape

X_train, y_train = pipeline.transform(df_props_filtered.iloc[train_ids]), df_props_filtered.iloc[train_ids].BestGender
X_test, y_test = pipeline.transform(df_props_filtered.iloc[test_ids]), df_props_filtered.iloc[test_ids].BestGender


Extracting column [Year], to_df = True
Extracting column [State], to_df = True
Extracting column [Name_proc], to_df = False
Extracting column [Year], to_df = True
Extracting column [State], to_df = True
Extracting column [Name_proc], to_df = False

In [52]:
model.fit(X_train, y_train)


Out[52]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [53]:
y_pred = model.predict(X_test)
print classification_report(y_pred, y_test)


             precision    recall  f1-score   support

          F       1.00      1.00      1.00      5723
          M       1.00      1.00      1.00      6387

avg / total       1.00      1.00      1.00     12110


In [54]:
f = pipeline.named_steps["features"]
df_model_coeffs = pd.DataFrame({"Feature": ["Year"] + f.transformer_list[1][1].named_steps["state_cat"].classes_.tolist() + f.transformer_list[2][1].named_steps["name_ngram"].get_feature_names(), "Coeff": model.coef_[0]})

In [55]:
df_model_coeffs.sort("Coeff", ascending=True).head(20)


Out[55]:
Coeff Feature
1253 -12.382296 a$
2387 -6.719285 ne
2984 -6.560792 ud
2031 -6.517110 ie$
1958 -6.246937 hl
2955 -6.048778 tt
171 -5.078938 Caro
82 -4.331077 Am
3130 -4.048076 ys
448 -3.754964 Joan
1929 -3.567873 he
1258 -3.542252 abe
2295 -3.527388 ly
427 -3.521152 Jea
1038 -3.485361 ^Jo$
2330 -3.472253 min$
2258 -3.391106 lle
1714 -3.378617 el
1912 -3.346282 h$
845 -3.209122 ^Ann

In [61]:
pipeline = Pipeline([
        ("features", FeatureUnion([
                    ("year_latlong",  Pipeline([
                                ("year_val", MultiColumnExtractor(colnames=["Year", "Latitude", "Longitude"])),
                                ("year_norm", StandardScaler())
                    ])),
                    ("names", Pipeline([
                               ("name_val", ColumnFeatures(colname="Name_proc", to_df=False)),
                               ("name_ngram", CountVectorizer(analyzer="char", ngram_range=(2, 4), binary=True, lowercase=False, min_df=10)),
                    ])),
                    
                ]))
    ])


Initialized extractor for column ['Year', 'Latitude', 'Longitude']
Initialized extractor for column Name_proc

In [62]:
model = LogisticRegression(penalty="l1")

np.random.seed(1337)
shuffle_ids = np.random.permutation(df_names.shape[0])
split_point = int(len(shuffle_ids)*0.6)
train_ids, test_ids = shuffle_ids[:split_point], shuffle_ids[split_point:]
train_ids.shape, test_ids.shape

X_train, y_train = pipeline.fit_transform(df_names_latlong.iloc[train_ids]), df_names_latlong.iloc[train_ids].BestGender
X_test, y_test = pipeline.transform(df_names_latlong.iloc[test_ids]), df_names_latlong.iloc[test_ids].BestGender


Extracting columns [['Year', 'Latitude', 'Longitude']]
Extracting column [Name_proc], to_df = False
Extracting columns [['Year', 'Latitude', 'Longitude']]
Extracting column [Name_proc], to_df = False

In [63]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print classification_report(y_pred, y_test)


             precision    recall  f1-score   support

          F       1.00      1.00      1.00      5723
          M       1.00      1.00      1.00      6387

avg / total       1.00      1.00      1.00     12110


In [64]:
df_props.head()


Out[64]:
Gender Name State Year F M Total MaxCount BestGender
0 Aaban NY 2013 0 6 6 6 M
1 Aaban NY 2014 0 6 6 6 M
2 Aadan CA 2008 0 7 7 7 M
3 Aadan CA 2009 0 6 6 6 M
4 Aadan CA 2014 0 5 5 5 M

In [65]:
f = pipeline.named_steps["features"]
df_model_coeffs = pd.DataFrame({"Feature": f.transformer_list[0][1].named_steps["year_val"].get_feature_names()\
                                + f.transformer_list[1][1].named_steps["name_ngram"].get_feature_names(),\
                                "Coeff": model.coef_[0]})

In [66]:
f = pipeline.named_steps["features"]

In [67]:
f.transformer_list[0][1].named_steps["year_val"].get_feature_names()


Out[67]:
['Year', 'Latitude', 'Longitude']

In [68]:
df_model_coeffs.head()


Out[68]:
Coeff Feature
0 0.706100 Year
1 -0.073698 Latitude
2 -0.102547 Longitude
3 0.000000 Aa
4 0.000000 Aar

In [69]:
df_t = df_props.groupby("Name")[["F", "M"]].max()
df_t.head()


Out[69]:
Gender F M
Name
Aaban 0 6
Aadan 0 7
Aadarsh 0 5
Aaden 0 158
Aadhav 0 6

In [70]:
df_t["Prop"] = np.abs(df_t["F"] - df_t["M"]) / (df_t["F"] + df_t["M"])
df_t[(df_t["Prop"] < 0.1) & ((df_t["F"] + df_t["M"]) > 100)].sort("Prop")


Out[70]:
Gender F M Prop
Name
Unnamed 79 79 0.000000
Notnamed 108 109 0.004608
Justice 80 79 0.006289
Charley 55 54 0.009174
Infant 544 560 0.014493
Emory 65 67 0.015152
Johnnie 268 257 0.020952
Stevie 52 55 0.028037
Tommie 117 124 0.029046
Jaylin 64 59 0.040650
Unknown 136 149 0.045614
Armani 59 65 0.048387
Darian 83 69 0.092105
Jaime 753 621 0.096070
Frankie 117 96 0.098592

In [71]:
df_t = df_props.groupby(["Name", "Year"])[["F", "M"]].sum().reset_index()
df_t["BestGender"] = df_t[["F", "M"]].idxmax(axis=1)

In [72]:
df_t.head()


Out[72]:
Gender Name Year F M BestGender
0 Aaban 2013 0 6 M
1 Aaban 2014 0 6 M
2 Aadan 2008 0 12 M
3 Aadan 2009 0 6 M
4 Aadan 2014 0 5 M

In [80]:
clust_model = AgglomerativeClustering(n_clusters=10)

In [81]:
cids = clust_model.fit_predict(cvec.transform(df_names.Name_proc.head(100)).todense())

In [82]:
cids


Out[82]:
array([0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 6, 6,
       6, 9, 6, 6, 9, 5, 9, 5, 6, 6, 8, 8, 8, 8, 5, 0, 0, 0, 5, 0, 1, 1, 1,
       1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 2, 1, 1, 1, 4, 4, 4, 4, 4, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 7, 0, 0, 0, 3,
       3, 3, 3, 3, 3, 3, 3, 3])

In [90]:
pd.concat((df_names.Name.head(100), pd.Series(cids, name="ClusterIDs", index=df_names.Name_proc.head(100).index)), axis=1).head()


Out[90]:
Name ClusterIDs
0 Aaban 0
1 Aadan 0
2 Aadarsh 0
3 Aaden 7
4 Aadhav 0

In [ ]: